
*** Data Aggregation for Bunching Regression *** 

	**********------------------------------**********
	* Contents
		* Step 1: Create Grid 
			* define cells of width a_bin = 0.5kg and eei_bin = 0.5 units in EEI. 
			* fillin to add all potential cells for each country & month 
		* Step 2: Prepare Data for regression (commented out)
			* generate outcome variables at cell level 
			* collapse at bin-country-month level
			* fillin to balance panel
	*********** end of do-file: regressions in bunching_2_regress.do* 
		* Step 3: Baseline Regression 
			* bunching windows of 0.5 / 1.0 / 1.5 / 2.0 width 
			* outcome: levels, log + 1, ihs 
			* estimates store B* 
		* Step 4: Interaction Terms 
			* bunching windows of 0.5 / 1.0 / 1.5 / 2.0 width 
			* outcome: levels, log + 1, ihs 
			* estimates store I*
		* Step 5: Export tables to .tex format  	 
	**********------------------------------**********

capture log close 
clear all
set more off 

* Set directory
global server "R:\WSV2\TBu_AKe" 
cd "$server" // set working directory 

capture mkdir Bunching_NEW
global folder "${server}\Bunching_NEW"

cd "$folder"

capture mkdir Data
global store "R:\WSV2\TBu_AKe\Bunching_NEW\Data"
capture ssc install carryforward

*Define sub directories
global desktop "C:\Users\hy65byfe\Desktop\smerge_0712"
*cd "$desktop" // set working directory 

cd "$store"
log using bunching_1_bins_2703, replace 

cd

	** Load data ** 
use "R:\WSV2\TBu_AKe\Product Ban\Stata\Bunching_Empirics\WM_V10_precollapse_binbasis.dta"
 
gen kwh = ae // avoid confusion with E,e : yearly consumption as defined by regulation.  

	
	** Determine width and steps of spatial cells ***
 
	* a (capacity)
scalar int_a = 0.5 // steps
scalar low_a = 2.75 // minimum
scalar high_a = 15.25 // max 
	* eei 
scalar int_e = 0.5 // steps
scalar low_e = 13 // minimum
scalar high_e = 143 


***************************************
** Step 1: cut cells to grid 
***************************************

	* 1a: capacity a 
egen a_bin=cut(capacity),at(`=low_a'(0.5)`=high_a')
replace a_bin = a_bin + (`=int_a'/2) // name by midpoint.
	** bin labelled 4.25 is technically (4.251 to 4.749), after rename by midpoint: 4.5 is (4.251 to 4.749)

sum a_bin 
codebook a_bin 
	* result: 18 cells of width 0.5 
 
	* 1b: e (in kwh/year) as defined by the eei 
	    * note: energy consumption e in a cell is determined by a and eei
codebook eei_bin // 133 unique values  
label variable eei_bin "EEI in 0.5-unit bins"

gen e_max = 0 

sort eei_bin 
forvalues i=`=low_e'(1)`=high_e' {
 quietly sum kwh if eei_bin == `i' 
 local k = r(max)
 replace e_max = `k' if eei_bin==`i' // determine max energy consumption allowed for a given size and eei value 
}

sum a_bin eei_bin // summarize bin structure  


	* 1c: store cell borders in local macros and scalars ***
sum a_bin, meanonly
scalar max_a = `r(max)'	
scalar min_a = `r(min)'
	** number of steps 
scalar n_a = 1+ (`=max_a' - `=min_a')/0.5

*** e_bin based on eei 
sum eei_bin, meanonly
scalar max_eei = `r(max)'	
scalar min_eei = `r(min)'
scalar int_eei = 0.5
	** number of steps 
scalar n_e = 1+ (`=max_eei' - `=min_eei')/`=int_eei'

scalar list // n_e: 260, n_a: 25. 


		****************************
		*** 1d: create full grid *** 
		****************************

sort a_bin // capacity at bin-level 

gen ts = . 			 // temporary cell identifier
gen added_a = 0 	// to identify added observations in later checks 

sum a_bin, meanonly
	*** 0.5 is cell width, smallest bin starts at 2.5 kg    
forvalues i=`=min_a'(0.5)`=max_a' {
replace ts= (2*(`i') - 5) if a_bin==`i' 
	* example: 2*3 - 5 = 1 for smallest
}


replace ts = 1 if ts == .      // fix missings 
by ts, sort: gen panel_a=_n   // running list of all values 
tsset panel_a ts             // set:1 to 25 with gaps 
tsfill                      // fill 
tab ts                     // confirm that there are no gaps 

by ts, sort: replace panel_a=_n
replace added_a = 1 if added_a == . 	// identifies added observations when == 1 
drop if panel_a > 1 & added_a == 1     // reduce to only 1 observations per added cell 


gen a_binf = 0 
summ ts, meanonly 					// only integers, continuous list 
forvalues i=`=r(min)'(1)`=r(max)' {
replace a_binf= (0.5*(`i') + 2.5) if ts==`i' //  0.5 is step width, 2.5 is minimum.
}

tab a_binf 	// must be balanced: 3 to 15, at 0.5 steps.  

replace a_bin = a_binf if added_a == 1 	// re-assign for newly added cells.
 
tab a_bin 	// must be balanced: 3 to 15, at 0.5 steps.

drop ts a_binf // drop auxiliary variables  


sort eei_bin // eei at bin level 

gen ts = . // temporary cell identifier
gen added_eei = 0  // identifier for added cells 

sum eei_bin, meanonly
	*** 0.5 is step width, 13.5 is min value in eei 
forvalues i=`=min_eei'(0.5)`=max_eei' { 
	replace ts= (2*(`i') - 26) if eei_bin==`i' 
}

replace ts = 1 if ts == . 			 // fix missings 
by ts, sort: gen panel_e=_n 		//  running list of all values
tsset panel_e ts  				   // set to unbalanced panel 
tsfill                            // fill 
tab ts 							 // 1-260 w/o gaps. 

by ts, sort: replace panel_e=_n 
replace added_e = 1 if added_e == . 
drop if panel_e > 1 & added_e == 1 

gen eei_binf = 0 
sum ts, meanonly 		// only integers, continous list 
forvalues i=`r(min)'(1)`r(max)' { 
	replace eei_binf=((`i') + 26)/2 if ts==`i'
}

tab eei_binf 								   // check. starts at 13.5 goes to 143. 
replace eei_bin = eei_binf if added_eei == 1  // re-assigned values for expanded grid.   
tab eei_bin 								  // 13.5 to 143. 

drop ts eei_binf

		**  gen e_bin: upper limit e_max = EEI/100 (kappa + sigma*a)
		* Result: Trapezoid (e_max-e_max_n-1)* 0.5 / 2

gen e_max_digit = 0 	// complete cell structure by defining energy values within a cell 	
bysort eei_bin a_bin: replace e_max_digit = ((eei_bin+0.5)/100)*(47*a_bin+51.7) // upper limit is (EEI/100)(kappa + sigma*a)
tab e_max_digit if a_bin == 6 & eei_bin == 58.5 // check one cell to confirm: 196.xx  

gen e_min_digit = 0 		
bysort eei_bin a_bin: replace e_min_digit = ((eei_bin)/100)*(47*a_bin+51.7) // lower limit 
tab e_min_digit if a_bin == 6 & eei_bin == 58.5 // should be 195.xx 

gen e_bin = 0 		
replace e_bin = floor(e_max_digit)  	// rounding down to nearest integer --> value in kwh that a product of given capacity can have to get assigned to this eei bin   
label variable e_max_digit "e(max exact)" 
label variable e_bin "e(max integer)" 
	 
	*** Fillin combinations of both attributes on the grid ****
 
sort a_bin eei_bin
fillin a_bin eei_bin	// create all combinations
label variable _fillin "grid fillin"  
	 
	** Fix: fillin sets year to missing, then those observations would get dropped in collapse  
replace year = 2017 if year == . & _fillin == 1 // replace those created by _fillin 
codebook id if year == . 			// all have missing id --> no existing products affected. 
replace year = 2017 if year == . 	// fix rest on yearly basis. 
replace date = 683 if date == . 	// fix rest on monthly basis. 
 
 
**** BREAK 1 ******
save bunching_1_bins_1512, replace 
cd // locate file. 
*******************

	** Step 2a: fix missing values **
drop if a_bin == .					
drop if eei_bin == . 	

replace added_a = 1 if added_a == . // 1 if added cell, 0 otherwise.
replace added_e = 1 if added_e == . // 1 if added cell, 0 otherwise.
tab added_* 					 
	
egen cell=group(eei_bin a_bin) 		// Grid of trapezoids with parallel vertical sides, defined by capacity a and index eei  
codebook cell 
	* result: 6500 (25*260)
	
	** Define segments in the product space **
gen r = 0 // restricted
gen b = 0 // bunching 
gen u = 0 // unrestricted 

replace r = 1 if eei_bin >= 59 				   // exactly 59 falls under restricted
replace b = 1 if eei_bin < 59 & eei_bin >= 57 // 2-unit window   
replace u = 1 if eei_bin < 57 	  			  // lower than eei = 57 

	** Step 2b: outcome by cell: country-specific ** 	
	
	** yearly **
	
	** cell-level sales, country specific
by eei_bin ccode, sort: egen cell_sales=sum(units) // all years 
by eei_bin year ccode, sort: egen cell_year_sales=sum(units) // by year 

	** cell-level product count 
by eei_bin year ccode id, 	sort: gen n=_n 	
replace n=0 			if n>1 &n!=.  
replace n=0 if added_a == 1
replace n=0 if added_e == 1 
by eei_bin year ccode, 		sort: egen cell_year_count=sum(n) // by year 
drop n	
	
by eei_bin ccode id, 	sort: gen n=_n 	
replace n=0 			if n>1 &n!=. 
replace n=0 if added_a == 1
replace n=0 if added_e == 1 
by eei_bin ccode, 		sort: egen cell_count=sum(n) // all years 
drop n 

	** cell-level sales in R and B  
by eei_bin year ccode, sort: egen cell_year_sales_R=sum(units) if r== 1 // by year 
by eei_bin year ccode, sort: egen cell_year_sales_B=sum(units) if b == 1 // by year 

	** cell-level product count in R and B
by eei_bin year ccode id, 	sort: gen n=_n 	
replace n=0 			if n>1 &n!=.  
replace n=0 if added_a == 1
replace n=0 if added_e == 1 
replace n=0 if r == 0 
by eei_bin year ccode, 		sort: egen cell_year_count_R =sum(n) // by year 
drop n	
	
by eei_bin year ccode id, 	sort: gen n=_n 	
replace n=0 			if n>1 &n!=.  
replace n=0 if added_a == 1
replace n=0 if added_e == 1 
replace n=0 if b == 0 
by eei_bin year ccode, 		sort: egen cell_year_count_B =sum(n) // by year 
drop n	

sum cell_* // display summary stats  

sum cell if a_bin == 6 & eei_bin == 58 // identify single cell in B to verify correct grid.  
sum cell_* if cell == 2232 & year == 2011 & ccode == 3 		// 18. 
codebook id if cell == 2232 & year == 2011 & ccode == 3 	// 18? check. 

	** monthly **

	** cell-level sales, country specific
by eei_bin date ccode, sort: egen cell_date_sales=sum(units) // by date 

	** cell-level product count 
by eei_bin date ccode id, 	sort: gen n=_n 	
replace n=0 			if n>1 &n!=.  
replace n=0 if added_a == 1
replace n=0 if added_e == 1 
by eei_bin date ccode, 		sort: egen cell_date_count=sum(n) // by date 
drop n	

	** cell-level sales in R and B  
by eei_bin date ccode, sort: egen cell_date_sales_R=sum(units) if r== 1 // by date 
by eei_bin date ccode, sort: egen cell_date_sales_B=sum(units) if b == 1 // by date 

	** cell-level product count 
by eei_bin date ccode id, 	sort: gen n=_n 	
replace n=0 			if n>1 &n!=.  
replace n=0 if added_a == 1
replace n=0 if added_e == 1 
replace n=0 if r == 0 
by eei_bin date ccode, 		sort: egen cell_date_count_R =sum(n) // by date 
drop n	
	
by eei_bin date ccode id, 	sort: gen n=_n 	
replace n=0 			if n>1 &n!=.  
replace n=0 if added_a == 1
replace n=0 if added_e == 1 
replace n=0 if b == 0 
by eei_bin date ccode, 		sort: egen cell_date_count_B =sum(n) // by date 
drop n	

	** verify structure for single cell 
sum cell_date* if cell == 2232 & year == 2011 & month == 6 & ccode == 3 // 18 obs, 25 products.  
codebook id if cell == 2232 & year == 2011 & month == 6 & ccode == 3 // 18? check.
	** total in bin 
sum cell_date* if eei_bin == 58.5 & year == 2011 & month == 6 & ccode == 3 // 18 obs, 25 products.  
codebook id if eei_bin == 58.5 & year == 2011 & month == 6 & ccode == 3

	** replace missings for added cells **
		** Set outcomes to zero, but price to . , avoids problems in collapse 
	
foreach var of varlist price_all {
replace `var' = . if added_a == 1 
replace `var' = . if added_e == 1 
}

foreach var of varlist units cell_year_sales cell_year_count cell_date_sales cell_date_count {
replace `var' = 0 if added_a == 1 
replace `var' = 0 if added_e == 1 
}

	** replace missings in time structure 
replace year = 2017 if added_a == 1 // missing not tolerated by collapse  
replace year = 2017 if added_e == 1 
replace date = 683 if added_a == 1 
replace date = 683 if added_e == 1 

gsort date +year +month 
bysort date: carryforward year, replace
bysort date: carryforward month, replace 


**** BREAK 2 ******
save bunching_1_bins_1512, replace 
cd // locate file. 
*******************
	
	*************************************************************
	** Step 2c : collapse product-level data to bin-level data ** 

	************************
	** date-country level **
	************************

preserve 

		*** monthly *** 
sort eei_bin ccode year month // sort order  
gen price2 = price_all // duplicated to get total expenditure and mean price 
gen kwh_max = kwh // duplicate to keep max value (within-cell bunchpoint)
replace ccode = 0 if missing(ccode)
#delimit ;
collapse (max) cell_year_sales cell_year_count cell_date_sales cell_date_count kwh_max e_max_digit e_min_digit cell
		 (sum) units units_clean price_all b r u  
		 (mean) e_bin a_bin capacity kwh price2 eei age_xc age_country 
		 (firstnm) added_a added_e year month, 
		 by(eei_bin ccode date); 
#delimit cr
order cell_*, last 
order price_all price2, last 
order capacity kwh units added_a added_e, last

	** 2d: clean up data structure to balanced panel (cell-country-period level)

	** Fillin to balance panel (assume: missing = 0, i.e. product-combo not offered/sold at time t)
fillin eei_bin ccode date 					// all combinations  
codebook eei_bin 							// 260*8*157 (157 is number of monthly periods)
xtset eei_bin								// check if balanced 

tab _fillin 							
label variable _fillin "bin-date_fill" 
replace added_a = 1 if added_a == . 	//   
drop added_e 							// identical to added_a after collapse (added cells by either attribute identify same sub-set) 

	** Time variables 
gsort date +year +month 				// fillin relies on date, month and year needed for regression
bysort date: carryforward year, replace
bysort date: carryforward month, replace 


	** Replace missings with zero for outcome, set price missing if zero
foreach var of varlist cell_date_count cell_date_sales units {
	replace `var' = 0 if `var' == . 
	replace `var' = 0 if added_a == 1 & _fillin == 0 // from collapse structure
}

foreach var of varlist price_all price2 {
	replace `var' = . if `var' == 0  // zero price does not exist in this set, but possible in other cases.  
}

	** Fill in missings in a_bin and e_bin from cell address 

	** capacity 
gsort cell +a_bin
bysort cell: carryforward a_bin, gen(a_bin3)

quietly list cell a_bin a_bin3, clean noobs

replace a_bin = a_bin3 
drop a_bin3

	** eei in bins 
gsort cell +eei_bin
bysort cell: carryforward eei_bin, gen(eei_bin3)
quietly list cell eei_bin eei_bin3, clean noobs

replace eei_bin = eei_bin3 if eei_bin == .  
drop eei_bin3

	** fix e_bin to eliminate missings based on a_bin, eei_bin.  	
bysort eei_bin a_bin: replace e_max_digit = ((eei_bin+0.5)/100)*(47*a_bin+51.7) // within-cell bunchpoint 
tab e_max_digit if a_bin == 6 & eei_bin == 58.5 // check individual cell 		
replace e_bin = floor(e_max_digit) // rounding down to nearest integer  

label variable e_max_digit "e(max exact)" 
label variable e_bin "e(max integer)" 

gsort cell +e_bin
bysort cell: carryforward e_bin, gen(e_bin3)
quietly list cell e_bin e_bin3, clean noobs

replace e_bin = e_bin3 
drop e_bin3 

	** check  
tab eei_bin, missing // no more missings.  
codebook e_bin // no more missings.  

	** define B (bunching) and R (restricted) 

gen restricted = 0 
gen bunching = 0 

	** bunching based on eei_bins 
replace restricted = 1 if eei_bin >= 59 
replace bunching = 1 if eei_bin < 59 & eei_bin >= 57 


gen space = 0 // categorical variables for all product segments: 0 for U, 1 for R, 2 for B  
replace space = 1 if restricted == 1 
replace space = 2 if bunching == 1 

	** align cell order to put bunching cells first for regression 
gsort +year +month +ccode -space +eei_bin // sort order avoids problems in spatial matrix 
by year month ccode: gen id_bin = _n // ordered list 
tab id_bin if bunching == 1 

	** simplify labelling for outcomes 
gen csd = cell_date_sales 
gen ccd = cell_date_count
gen csy = cell_year_sales 
gen ccy = cell_year_count

**************************
save bunching_2_collapse_1512, replace 
****************************

log close 

clear 

exit 
